* ==============================================================================
* PM 2.5 NAPS air quality data cleaning. 
* This file imports raw (excel file) NAPS data into stata, drops variables, and 
* merges into one longitudinal dataset. 

* The original raw data is reported by monitor. This file aggregates across monitors
* within a given CMA/CA using the aggregation directions provided by the Canada 
* Wide Standards. The final dataset produces a variable "PM25_CMA", that gives the 
* CMA/CA's PM 2.5 concentration needed to determine compliance with the CWS. If a
* CMA/CA-year has a PM25_CMA higher than 30 they are not in compliance with the CWS.  

* Note that the folder structure in the raw data folder is preserved in the stata 
* data folder. 

* Note that if using the data files provided with the REStat replication 
* ("StationList_withCMA_SecondJoin.dta" and "PM25_NAPS_Data.dta") please start
* from Step 5. 

* Author: Nouri Najjar
* Created: August 2015
* Last Updated: November 2019
* ==============================================================================

* ------------------------------------------------------------
* Preamble: Define data directories. Must be adjusted accordingly
* ------------------------------------------------------------
* Raw data files
*local datadir_Raw D:\Google Drive\NAPS Data - Public\PM 2.5\Raw Data\

local datadir_Stata D:\REStat Data Archive\AQ Data\

*local datadir_StatList D:\Google Drive\NAPS Data - Public\PM 2.5\

local datadir_GIS D:\REStat Data Archive\AQ Data\

* ------------------------------------------------------------
/*
* ------------------------------------------------------------
* Step 1: Import each excel file into stata
* ------------------------------------------------------------
* BAM data
local files : dir "`datadir_Raw'BAM\" files "*.xls"

foreach file_name in `files' {
	cd "`datadir_Raw'BAM\"
	* Import file
	import excel using `file_name', sheet("24hr") firstrow
	* Drop and rename variables
	drop Elevation Type ID Minimum thPercentile K L M N O P R S Maximum January February March April May June July August September October November December Mean StandardDeviation
	rename Q PM25_BAM
	rename PercentCompleteness PerComp_BAM
	* Create Year variable
	gen Year = substr("`file_name'", 1 , 4 )
	destring(Year),replace
	* Save file
	cd "`datadir_Stata'BAM\"
	save "`file_name'.dta", replace
	clear
}

* BAM35 data
local files : dir "`datadir_Raw'BAM35\" files "*.xls"

foreach file_name in `files' {
	cd "`datadir_Raw'BAM35\"
	* Import file
	import excel using `file_name', sheet("24hr") firstrow
	* Drop and rename variables
	drop Elevation Type ID Minimum thPercentile K L M N O P R S Maximum January February March April May June July August September October November December Mean StandardDeviation
	rename Q PM25_BAM35
	rename PercentCompleteness PerComp_BAM35
	* Create Year variable
	gen Year = substr("`file_name'", 1 , 4 )
	destring(Year),replace
	* Save file
	cd "`datadir_Stata'BAM35\"
	save "`file_name'.dta", replace
	clear
}


* SHARP5030 data
local files : dir "`datadir_Raw'SHARP5030\" files "*.xls"

foreach file_name in `files' {
	cd "`datadir_Raw'SHARP5030\"
	* Import file
	import excel using `file_name', sheet("24hr") firstrow
	* Drop and rename variables
	drop Elevation Type ID Minimum thPercentile K L M N O P R S Maximum January February March April May June July August September October November December Mean StandardDeviation
	rename Q PM25_SHARP5030
	rename PercentCompleteness PerComp_SHARP5030
	* Create Year variable
	gen Year = substr("`file_name'", 1 , 4 )
	destring(Year),replace
	* Save file
	cd "`datadir_Stata'SHARP5030\"
	save "`file_name'.dta", replace
	clear
}

* TEOM data
local files : dir "`datadir_Raw'TEOM\" files "*.xls"

foreach file_name in `files' {
	cd "`datadir_Raw'TEOM\"
	* Import file
	import excel using `file_name', sheet("24hr") firstrow
	* Drop and rename variables
	drop Elevation Type ID Minimum thPercentile K L M N O P R S Maximum January February March April May June July August September October November December Mean StandardDeviation
	rename Q PM25_TEOM
	rename PercentCompleteness PerComp_TEOM
	* Create Year variable
	gen Year = substr("`file_name'", 1 , 4 )
	destring(Year),replace
	* Save file
	cd "`datadir_Stata'TEOM\"
	save "`file_name'.dta", replace
	clear
}


* TEOM - FDMS data
local files : dir "`datadir_Raw'TEOM - FDMS\" files "*.xls"

foreach file_name in `files' {
	cd "`datadir_Raw'TEOM - FDMS\"
	* Import file
	import excel using `file_name', sheet("24hr") firstrow
	* Drop and rename variables
	drop Elevation Type ID Minimum thPercentile K L M N O P R S Maximum January February March April May June July August September October November December Mean StandardDeviation
	rename Q PM25_TEOM_FDMS
	rename PercentCompleteness PerComp_TEOM_FDMS
	* Create Year variable
	gen Year = substr("`file_name'", 1 , 4 )
	destring(Year),replace
	* Save file
	cd "`datadir_Stata'TEOM - FDMS\"
	save "`file_name'.dta", replace
	clear
}


* TEOM - SES data
local files : dir "`datadir_Raw'TEOM - SES\" files "*.xls"

foreach file_name in `files' {
	cd "`datadir_Raw'TEOM - SES\"
	* Import file
	import excel using `file_name', sheet("24hr") firstrow
	* Drop and rename variables
	drop Elevation Type ID Minimum thPercentile K L M N O P R S Maximum January February March April May June July August September October November December Mean StandardDeviation
	rename Q PM25_TEOM_SES
	rename PercentCompleteness PerComp_TEOM_SES
	* Create Year variable
	gen Year = substr("`file_name'", 1 , 4 )
	destring(Year),replace
	* Save file
	cd "`datadir_Stata'TEOM - SES\"
	save "`file_name'.dta", replace
	clear
}

* ------------------------------------------------------------


* ------------------------------------------------------------
* Step 2: Merge into one dataset
* ------------------------------------------------------------
* Initiate longitudinal dataset
use "`datadir_Stata'TEOM\1998_annualpm25_teom.xls.dta"
save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* TEOM data
* Open other datasets and sort
local files : dir "`datadir_Stata'TEOM\" files "*.dta"
cd "`datadir_Stata'TEOM\"

foreach file_name in `files' {
	* open file
	use `file_name'
	* sort data
	sort NAPSID Year
	* Save file
	save `file_name', replace
	clear
}

* Open main dataset and sort on merging variables (NAPSID and Year)
use "`datadir_Stata'PM25_NAPS_Data.dta"
sort NAPSID Year
save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* Merge data
local files : dir "`datadir_Stata'TEOM\" files "*.dta"
cd "`datadir_Stata'TEOM\"

foreach file_name in `files' {
	* merge files and drop merge variable
	merge 1:m NAPSID Year using `file_name'
	drop _merge
}

save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* BAM data
* Open other datasets and sort
local files : dir "`datadir_Stata'BAM\" files "*.dta"
cd "`datadir_Stata'BAM\"

foreach file_name in `files' {
	* open file
	use `file_name'
	* sort data
	sort NAPSID Year
	* Save file
	save `file_name', replace
	clear
}

* Open main dataset and sort on merging variables (NAPSID and Year)
use "`datadir_Stata'PM25_NAPS_Data.dta"
sort NAPSID Year
save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* Merge data
local files : dir "`datadir_Stata'BAM\" files "*.dta"
cd "`datadir_Stata'BAM\"

foreach file_name in `files' {
	* merge files and drop merge variable
	merge 1:m NAPSID Year using `file_name'
	drop _merge
}

save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* BAM35 data
* Open other datasets and sort
local files : dir "`datadir_Stata'BAM\" files "*.dta"
cd "`datadir_Stata'BAM\"

foreach file_name in `files' {
	* open file
	use `file_name'
	* sort data
	sort NAPSID Year
	* Save file
	save `file_name', replace
	clear
}

* Open main dataset and sort on merging variables (NAPSID and Year)
use "`datadir_Stata'PM25_NAPS_Data.dta"
sort NAPSID Year
save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* Merge data
local files : dir "`datadir_Stata'BAM35\" files "*.dta"
cd "`datadir_Stata'BAM35\"

foreach file_name in `files' {
	* merge files and drop merge variable
	merge 1:m NAPSID Year using `file_name'
	drop _merge
}

save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* SHARP5030 data
* Open other datasets and sort
local files : dir "`datadir_Stata'SHARP5030\" files "*.dta"
cd "`datadir_Stata'SHARP5030\"

foreach file_name in `files' {
	* open file
	use `file_name'
	* sort data
	sort NAPSID Year
	* Save file
	save `file_name', replace
	clear
}

* Open main dataset and sort on merging variables (NAPSID and Year)
use "`datadir_Stata'PM25_NAPS_Data.dta"
sort NAPSID Year
save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* Merge data
local files : dir "`datadir_Stata'SHARP5030\" files "*.dta"
cd "`datadir_Stata'SHARP5030\"

foreach file_name in `files' {
	* merge files and drop merge variable
	merge 1:m NAPSID Year using `file_name', force
	drop _merge
}

save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* TEOM - FDMS data
* Open other datasets and sort
local files : dir "`datadir_Stata'TEOM - FDMS\" files "*.dta"
cd "`datadir_Stata'TEOM - FDMS\"

foreach file_name in `files' {
	* open file
	use `file_name'
	* sort data
	sort NAPSID Year
	* Save file
	save `file_name', replace
	clear
}

* Open main dataset and sort on merging variables (NAPSID and Year)
use "`datadir_Stata'PM25_NAPS_Data.dta"
sort NAPSID Year
save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* Merge data
local files : dir "`datadir_Stata'TEOM - FDMS\" files "*.dta"
cd "`datadir_Stata'TEOM - FDMS\"

foreach file_name in `files' {
	* merge files and drop merge variable
	merge 1:m NAPSID Year using `file_name'
	drop _merge
}

save "`datadir_Stata'PM25_NAPS_Data.dta", replace


* TEOM - SES data
* Open other datasets and sort
local files : dir "`datadir_Stata'TEOM - SES\" files "*.dta"
cd "`datadir_Stata'TEOM - SES\"

foreach file_name in `files' {
	* open file
	use `file_name'
	* sort data
	sort NAPSID Year
	* Save file
	save `file_name', replace
	clear
}

* Open main dataset and sort on merging variables (NAPSID and Year)
use "`datadir_Stata'PM25_NAPS_Data.dta"
sort NAPSID Year
save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* Merge data
local files : dir "`datadir_Stata'TEOM - SES\" files "*.dta"
cd "`datadir_Stata'TEOM - SES\"

foreach file_name in `files' {
	* merge files and drop merge variable
	merge 1:m NAPSID Year using `file_name'
	drop _merge
}

save "`datadir_Stata'PM25_NAPS_Data.dta", replace

* ------------------------------------------------------------


* ------------------------------------------------------------
* Step 3: Filter out station-years with incomplete data and 
* compute PM25 score for each station. 
* ------------------------------------------------------------

* Compute inclusion indicators to determine if monitor's data can be used. 
* Requires over 75 percent completeness or below 75% but with a reading above the CWS threshold. 
* Inclusion inditar for TEOM
gen Include_TEOM = 1 if(missing(PerComp_TEOM)==0 & PerComp_TEOM>=75 | (missing(PerComp_TEOM)==0 & PerComp_TEOM<75 & PM25_TEOM>=30))
recode Include_TEOM (.=0)
* Inclusion inditar for BAM
gen Include_BAM = 1 if(missing(PerComp_BAM)==0 & PerComp_BAM>=75 | (missing(PerComp_BAM)==0 & PerComp_BAM<75 & PM25_BAM>=30))
recode Include_BAM (.=0)
* Inclusion inditar for BAM35
gen Include_BAM35 = 1 if(missing(PerComp_BAM35)==0 & PerComp_BAM35>=75 | (missing(PerComp_BAM35)==0 & PerComp_BAM35<75 & PM25_BAM35>=30))
recode Include_BAM35 (.=0)
* Inclusion inditar for SHARP5030
gen Include_SHARP5030 = 1 if(missing(PerComp_SHARP5030)==0 & PerComp_SHARP5030>=75 | (missing(PerComp_SHARP5030)==0 & PerComp_SHARP5030<75 & PM25_SHARP5030>=30))
recode Include_SHARP5030 (.=0)
* Inclusion inditar for TEOM_FDMS
gen Include_TEOM_FDMS = 1 if(missing(PerComp_TEOM_FDMS)==0 & PerComp_TEOM_FDMS>=75 | (missing(PerComp_TEOM_FDMS)==0 & PerComp_TEOM_FDMS<75 & PM25_TEOM_FDMS>=30))
recode Include_TEOM_FDMS (.=0)
* Inclusion inditar for TEOM_SES
gen Include_TEOM_SES = 1 if(missing(PerComp_TEOM_SES)==0 & PerComp_TEOM_SES>=75 | (missing(PerComp_TEOM_SES)==0 & PerComp_TEOM_SES<75 & PM25_TEOM_SES>=30))
recode Include_TEOM_SES (.=0)

* Interact indicator with PM score for monitor to give effective score. This is 
* in computing the site's maximum PM25 reading for a given year. 
gen PM25_TEOM_Effective = PM25_TEOM*Include_TEOM
gen PM25_BAM_Effective = PM25_BAM*Include_BAM
gen PM25_BAM35_Effective = PM25_BAM35*Include_BAM35
gen PM25_SHARP5030_Effective = PM25_SHARP5030*Include_SHARP5030
gen PM25_TEOM_FDMS_Effective = PM25_TEOM_FDMS*Include_TEOM_FDMS
gen PM25_TEOM_SES_Effective = PM25_TEOM_SES*Include_TEOM_SES

* Keep the largest valid reading from the site-year's monitors as the site's score
egen PM25 = rowmax(PM25_TEOM_Effective PM25_BAM_Effective PM25_BAM35_Effective PM25_SHARP5030_Effective PM25_TEOM_FDMS_Effective PM25_TEOM_SES_Effective)

* Recode to indicate no valid monitors for site-year
replace PM25=. if(Include_TEOM==0 & Include_BAM==0 & Include_BAM35==0 & Include_SHARP5030==0 & Include_TEOM_FDMS==0 & Include_TEOM_SES==0)

* Save
save "`datadir_Stata'PM25_NAPS_Data.dta", replace
* ------------------------------------------------------------


* ------------------------------------------------------------
* Step 4: Compute 3-year average for each station from 2000 on.
* ------------------------------------------------------------
* add zeros to missing values for 3-year ave computation
gen PM25_Store = PM25 if PM25!=0
recode PM25_Store (.=0)

* Count number of missing years in previous three years
bysort NAPSID (Year): gen MissCount = missing(PM25[_n]) + missing(PM25[_n-1]) + missing(PM25[_n-2])

* 3 year ave using all three years for sites with enough data to do so
bysort NAPSID (Year): gen PM25_3YearAve = (PM25_Store[_n] + PM25_Store[_n-1] + PM25_Store[_n-2])/3 if MissCount==0
* 3 year ave using two years for sites with enough data to do so
bysort NAPSID (Year): replace PM25_3YearAve = (PM25_Store[_n] + PM25_Store[_n-1] + PM25_Store[_n-2])/2 if MissCount==1
* 3 year ave using 1 year for sites with enough data to do so
bysort NAPSID (Year): replace PM25_3YearAve = (PM25_Store[_n] + PM25_Store[_n-1] + PM25_Store[_n-2]) if MissCount==2

* Save
save "`datadir_Stata'PM25_NAPS_Data.dta", replace
* ------------------------------------------------------------
*/
* ------------------------------------------------------------
* Step 5: merge in CMA/CA
* ------------------------------------------------------------

* Open master file and rename
use "`datadir_Stata'PM25_NAPS_Data.dta"
save "`datadir_Stata'PM25_NAPS_Data_WithCMA.dta", replace
* Sort on NAPSID
sort NAPSID
* Merge in CMA data
merge m:1 NAPSID using "`datadir_GIS'StationList_withCMA_SecondJoin.dta"
drop if _merge==2
drop _merge
* Save
save "`datadir_Stata'PM25_NAPS_Data_WithCMA.dta", replace
* ------------------------------------------------------------


* ------------------------------------------------------------
* Step 6: produce geographic mean by CMA and save at CMA-Year level
* ------------------------------------------------------------
* Compute average for CMA-year
bysort CMAUID Year: egen PM25_CMA = mean(PM25)
* Compute duplicate indicator for CMA-year
bysort CMAUID Year: gen DupCMAYear = cond(_N==1,0,_n)

* Drop duplicate CMA Year
drop if DupCMAYear>1
* Drop missing CMAs
drop if missing(CMANAME)==1

* add zeros to missing values for 3-year ave computation
gen PM25_CMA_Store = PM25_CMA if PM25_CMA!=0
recode PM25_CMA_Store (.=0)

* Count number of missing years in previous three years
bysort CMAUID (Year): gen MissCount_CMA = missing(PM25_CMA[_n]) + missing(PM25_CMA[_n-1]) + missing(PM25_CMA[_n-2]) if DupCMAYear<=1

* 3 year ave using all three years for sites with enough data to do so
bysort CMAUID (Year): gen PM25_CMA_3YearAve = (PM25_CMA_Store[_n] + PM25_CMA_Store[_n-1] + PM25_CMA_Store[_n-2])/3 if MissCount_CMA==0 & DupCMAYear<=1
* 3 year ave using two years for sites with enough data to do so
bysort CMAUID (Year): replace PM25_CMA_3YearAve = (PM25_CMA_Store[_n] + PM25_CMA_Store[_n-1] + PM25_CMA_Store[_n-2])/2 if MissCount_CMA==1 & DupCMAYear<=1
* 3 year ave using 1 year for sites with enough data to do so
bysort CMAUID (Year): replace PM25_CMA_3YearAve = (PM25_CMA_Store[_n] + PM25_CMA_Store[_n-1] + PM25_CMA_Store[_n-2]) if MissCount_CMA==2 & DupCMAYear<=1

* Drop variables
keep CMAUID CMANAME Year PM25_CMA PM25_CMA_3YearAve

* Save
save "`datadir_Stata'PM25_NAPS_Data_WithCMA.dta", replace
* ------------------------------------------------------------
